In [ ]:
# Execute the previous notebook of the series through the IPython %run magic;
# presumably this is what brings the helpers and globals used below
# (np, pd, getUserDataVector, correctAnswers, ...) into scope - TODO confirm.
%run "../Functions/3. Per session and per user analysis.ipynb"
# Trace message naming this notebook.
print("4. User comparison")
In [ ]:
def getAllUsers( dataframe ):
    """Return the distinct, non-missing user ids found in ``dataframe['userId']``.

    Missing values (``None`` or any NaN object) and the placeholder strings
    ``'nan'`` and ``'null'`` are discarded.

    :param dataframe: table exposing a ``'userId'`` column.
    :return: list of the remaining unique user ids, in the order in which
        ``Series.unique()`` yields them.
    """
    placeholders = ('nan', 'null')
    return [
        userId
        for userId in dataframe['userId'].unique()
        # pd.isnull is used instead of ``in [np.nan]``: NaN never compares
        # equal to itself, so list membership only matched the np.nan
        # singleton and let other NaN objects (e.g. values read from a float
        # column) through.
        if not pd.isnull(userId)
        and not (isinstance(userId, str) and userId in placeholders)
    ]
In [ ]:
# _source is used as correction source, if we want to include answers to these questions
def getAllUserVectorData( userIds, _rmDF, _gfDF, _source = correctAnswers, _printDebug = True, _binary=True):
    """Compute the data vector of every listed user and assemble them side by side.

    Each vector is produced by ``getUserDataVector``; a progress bar widget is
    displayed while the users are processed and is closed afterwards, even
    when a computation raises.

    :param userIds: user ids to process; must support ``len()`` and iteration.
    :param _rmDF: forwarded unchanged to ``getUserDataVector``.
    :param _gfDF: forwarded unchanged to ``getUserDataVector``.
    :param _source: correction source forwarded to ``getUserDataVector``;
        pass a wider source to include the answers to additional questions.
    :param _printDebug: forwarded unchanged to ``getUserDataVector``.
    :param _binary: forwarded unchanged to ``getUserDataVector``.
    :return: the vectors concatenated along ``axis=1``; the lone vector itself
        when there is exactly one user; an empty list when ``userIds`` is empty.
    """
    progress = FloatProgress(min=0, max=len(userIds))
    display(progress)
    dataVectors = []
    try:
        for userId in userIds:
            progress.value += 1
            dataVectors.append(
                getUserDataVector(
                    userId,
                    _rmDF = _rmDF,
                    _gfDF = _gfDF,
                    _source = _source,
                    _printDebug = _printDebug,
                    _binary = _binary,
                )
            )
    finally:
        # Release the widget whether or not every user could be processed.
        progress.close()

    if not dataVectors:
        return []
    if len(dataVectors) == 1:
        # A single vector is handed back untouched rather than copied by concat.
        return dataVectors[0]
    # One concatenation at the end avoids re-copying the accumulated table
    # for every user.
    return pd.concat(dataVectors, axis=1)
In [ ]:
def getAllUserVectorDataCustom(_rmDF, _gfDF, before, after, gfMode = False, rmMode = True, sessionCount = 1):
    """Return the user vectors of a survey-selected population with a given session count.

    The population is picked from ``_gfDF``:

    * ``before`` and ``after``: ``getSurveysOfUsersWhoAnsweredBoth``;
    * only ``before``: ``getRMBefores`` if ``rmMode`` else ``getGFBefores``;
    * only ``after``: ``getRMAfters`` if ``rmMode`` else ``getGFormAfters``;
    * neither: nobody.

    :param _rmDF: forwarded to ``getAllUserVectorData``.
    :param _gfDF: survey table handed to the selection helpers and forwarded
        to ``getAllUserVectorData``.
    :param before: select users through their "before" surveys.
    :param after: select users through their "after" surveys.
    :param gfMode: only used when both ``before`` and ``after`` are set.
    :param rmMode: chooses between the RM and GF helpers (see above).
    :param sessionCount: value the ``'sessionsCount'`` entry must equal.
    :return: the vectors (one column per user) whose ``'sessionsCount'``
        equals ``sessionCount``, or ``[]`` after printing a notice when the
        selection is empty.
    """
    if before and after:
        surveys = getSurveysOfUsersWhoAnsweredBoth(_gfDF, gfMode = gfMode, rmMode = rmMode)
    elif before:
        surveys = getRMBefores(_gfDF) if rmMode else getGFBefores(_gfDF)
    elif after:
        surveys = getRMAfters(_gfDF) if rmMode else getGFormAfters(_gfDF)
    else:
        surveys = []

    if not len(surveys) > 0:
        print("no matching user")
        return []

    # presumably the helpers return a table with a `localplayerguidkey` column
    # holding the user ids - verify against their definitions
    selectedUserIds = surveys[localplayerguidkey]
    # Transposed so that each user becomes a row that can be filtered.
    perUserRows = getAllUserVectorData(selectedUserIds, _rmDF = _rmDF, _gfDF = _gfDF).T
    hasRequestedSessionCount = perUserRows['sessionsCount'] == sessionCount
    return perUserRows[hasRequestedSessionCount].T
In [ ]:
# Correlation methods accepted by the plot below; the first one is the fallback.
methods = ['pearson', 'kendall', 'spearman']
def plotAllUserVectorDataCorrelationMatrix(
    _allUserVectorData,
    _method = methods[0],
    _title='RedMetrics Correlations',
    _abs=False,
    _clustered=False,
    _figsize = (20,20),
    columnSubset=[]
):
    """Plot the correlation matrix of the columns of ``_allUserVectorData``.

    A 4-step progress bar widget is displayed while the plot is computed.

    :param _allUserVectorData: table whose columns are correlated; its values
        are cast to float first.
    :param _method: one of ``methods``; any other value falls back to
        ``methods[0]``.
    :param _title: title of the heatmap; not used by the clustered plot.
    :param _abs: plot absolute correlations; the color scale then starts at 0
        instead of -1.
    :param _clustered: draw a seaborn clustermap instead of a heatmap.
    :param _figsize: figure size handed to matplotlib / seaborn.
    :param columnSubset: columns to restrict the analysis to. The restriction
        is applied only if every listed column exists; otherwise the whole
        table is used. The default list is never mutated.
    :return: None; the figure is drawn as a side effect.
    """
    _progress = FloatProgress(min=0, max=4)
    display(_progress)

    # computation of subset
    if len(columnSubset) > 0 and pd.Series(columnSubset).isin(_allUserVectorData.columns).all():
        _allUserVectorData = _allUserVectorData.loc[:,columnSubset]

    # computation of correlation matrix
    _m = _method if _method in methods else methods[0]
    _correlation = _allUserVectorData.astype(float).corr(_m)
    _progress.value += 1

    if _abs:
        _correlation = _correlation.abs()
    _progress.value += 1

    vmin = 0 if _abs else -1
    vmax = 1

    # plot
    if _clustered:
        # Lines made only of NaNs can't be clustered: keep the labels having at
        # least one defined correlation, on both axes.
        # (The previous row loop negated with `~`, which is only a logical
        # "not" for numpy booleans.)
        _hasData = _correlation.notnull().any(axis=1)
        _notNaNsIndices = _correlation.index[_hasData.values]
        _correlation = _correlation.loc[_notNaNsIndices,_notNaNsIndices]
        _progress.value += 1

        sns.clustermap(
            _correlation,
            cmap=plt.cm.jet,
            square=True,
            figsize=_figsize,
            vmin=vmin,
            vmax=vmax,
        )
    else:
        plt.figure(figsize=_figsize)
        _ax = plt.subplot(111)
        _ax.set_title(_title)
        _progress.value += 1

        sns.heatmap(
            _correlation,
            ax=_ax,
            cmap=plt.cm.jet,
            square=True,
            vmin=vmin,
            vmax=vmax,
        )
    _progress.value += 1
In [ ]:
def getPercentageCrossCorrect(binarized, figsize=(40,100)):
    """Draw two annotated heatmaps of cross-correct answer percentages.

    The left plot shows ``getCrossCorrectAnswers(binarized)`` as a percentage
    of the number of rows of ``binarized``. The right plot divides the same
    counts, column by column, by the column sums of ``binarized`` (a sum of 0
    is replaced by 1 to avoid dividing by zero).

    :param binarized: table of answers; assumed to hold 0/1 values with one
        row per respondent and one column per question - TODO confirm.
    :param figsize: size of the figure holding both heatmaps.
    :return: None; the figure is drawn as a side effect.
    """
    cbar_kws = dict(orientation= "horizontal")

    intermediaryNumerator = getCrossCorrectAnswers(binarized).round().astype(int)*100
    percentagesCrossCorrect = (intermediaryNumerator / binarized.shape[0]).round().astype(int)

    # Column sums of `binarized`.
    totalPerQuestion = np.dot(np.ones(binarized.shape[0]), binarized)
    totalPerQuestion[totalPerQuestion == 0] = 1
    # NaNs must be replaced *before* the integer cast: the cast cannot
    # represent them, and no NaN can exist after it.
    percentagesConditionalCrossCorrect = (
        (intermediaryNumerator / totalPerQuestion).round().fillna(0).astype(int)
    )

    plt.figure(figsize=figsize)
    panels = (
        (121, 'percentage correct', percentagesCrossCorrect),
        (122, 'percentage correct, conditionnally: p(y | x)', percentagesConditionalCrossCorrect),
    )
    for position, title, percentages in panels:
        _ax = plt.subplot(position)
        _ax.set_title(title)
        sns.heatmap(
            percentages,
            ax=_ax,
            cmap=plt.cm.jet,
            square=True,
            annot=True,
            fmt='d',
            cbar_kws=cbar_kws,
            vmin=0,
            vmax=100,
        )
    plt.tight_layout()
In [ ]:
def getCompletedRate(_rmdf):
    """Return the fraction of distinct users having at least one 'complete' event.

    :param _rmdf: event table with a ``'type'`` column and a ``QUserId`` column.
    :return: distinct users among the rows whose ``'type'`` is ``'complete'``
        divided by distinct users overall, as a float.
    :raises ZeroDivisionError: when the table holds no user at all.
    """
    completeEvents = _rmdf[_rmdf['type'] == 'complete']
    completerCount = completeEvents[QUserId].nunique()
    playerCount = _rmdf[QUserId].nunique()
    return float(completerCount) / float(playerCount)
In [ ]:
# Folders, under dataFolderPath, where the user vector tables are stored as CSV:
# one for the tables built with binary=True, one for those built with binary=False
# (see getAllDataCSVPath). dataFolderPath is concatenated as-is, so it is
# presumably expected to end with a path separator - TODO confirm.
allBinaryUserVectorDataPath = dataFolderPath + "allBinaryUserVectorData/"
allNumericUserVectorDataPath = dataFolderPath + "allNumericUserVectorData/"
In [ ]:
def getAllDataCSVPath(filePathStem, binary=True):
    """Build the path of the CSV file caching a user vector table.

    :param filePathStem: file name without folder nor suffix.
    :param binary: truthy to target ``allBinaryUserVectorDataPath``, falsy to
        target ``allNumericUserVectorDataPath``.
    :return: folder + ``filePathStem`` + ``csvSuffix``.
    """
    folderPath = allBinaryUserVectorDataPath if binary else allNumericUserVectorDataPath
    return folderPath + filePathStem + csvSuffix
In [ ]:
def loadAllDataCSV(filePathStem, binary=True):
    """Load a cached user vector table from its CSV file as float64 values.

    :param filePathStem: file name without folder nor suffix, resolved with
        ``getAllDataCSVPath``.
    :param binary: selects the folder, see ``getAllDataCSVPath``.
    :return: the table with every column converted to ``np.float64``. When
        the first CSV column is the unnamed one pandas reads as
        ``'Unnamed: 0'``, it becomes the (nameless) index.
    """
    currentDF = pd.read_csv(getAllDataCSVPath(filePathStem, binary=binary), dtype=str)
    if currentDF.columns[0] == 'Unnamed: 0':
        currentDF = currentDF.set_index('Unnamed: 0')
        # Assign None rather than `del`: Index.name cannot be deleted on
        # pandas versions where it is a property.
        currentDF.index.name = None
    return currentDF.astype(np.float64)
In [ ]:
def saveAllDataCSV(allData, filePathStem, binary=True):
    """Write a user vector table to its cache CSV file.

    :param allData: table to save; its ``to_csv`` method is used.
    :param filePathStem: file name without folder nor suffix, resolved with
        ``getAllDataCSVPath``.
    :param binary: selects the folder, see ``getAllDataCSVPath``.
    :return: None.
    """
    csvPath = getAllDataCSVPath(filePathStem, binary=binary)
    allData.to_csv(csvPath, encoding=csvEncoding)
In [ ]:
# Switch shared by the two caching blocks of this cell: True recomputes every
# user vector table with getAllUserVectorData and overwrites its CSV file,
# False reloads the tables from the CSV files.
regenerateData = False
if regenerateData:
# Recompute the four tables built with _binary=True, one per playtest dataset
# (phase 1 / phase 2, all unique profiles / volunteers only). The correction
# source is correctAnswers extended with demographicAnswers.
allBinaryDataPlaytestPhase1PretestPosttestUniqueProfiles = getAllUserVectorData(
getAllResponders(gfdfPlaytestPhase1PretestPosttestUniqueProfiles),
_rmDF = rmdfPlaytestPhase1PretestPosttestUniqueProfiles,
_gfDF = gfdfPlaytestPhase1PretestPosttestUniqueProfiles,
_source = correctAnswers + demographicAnswers,
_binary=True )
allBinaryDataPlaytestPhase1PretestPosttestUniqueProfilesVolunteers = getAllUserVectorData(
getAllResponders(gfdfPlaytestPhase1PretestPosttestUniqueProfilesVolunteers),
_rmDF = rmdfPlaytestPhase1PretestPosttestUniqueProfilesVolunteers,
_gfDF = gfdfPlaytestPhase1PretestPosttestUniqueProfilesVolunteers,
_source = correctAnswers + demographicAnswers,
_binary=True )
allBinaryDataPlaytestPhase2PretestPosttestUniqueProfiles = getAllUserVectorData(
getAllResponders(gfdfPlaytestPhase2PretestPosttestUniqueProfiles),
_rmDF = rmdfPlaytestPhase2PretestPosttestUniqueProfiles,
_gfDF = gfdfPlaytestPhase2PretestPosttestUniqueProfiles,
_source = correctAnswers + demographicAnswers,
_binary=True )
allBinaryDataPlaytestPhase2PretestPosttestUniqueProfilesVolunteers = getAllUserVectorData(
getAllResponders(gfdfPlaytestPhase2PretestPosttestUniqueProfilesVolunteers),
_rmDF = rmdfPlaytestPhase2PretestPosttestUniqueProfilesVolunteers,
_gfDF = gfdfPlaytestPhase2PretestPosttestUniqueProfilesVolunteers,
_source = correctAnswers + demographicAnswers,
_binary=True )
# Persist the freshly computed tables in the binary folder.
saveAllDataCSV(allBinaryDataPlaytestPhase1PretestPosttestUniqueProfiles, "PlaytestPhase1PretestPosttestUniqueProfiles", binary=True)
saveAllDataCSV(allBinaryDataPlaytestPhase1PretestPosttestUniqueProfilesVolunteers, "PlaytestPhase1PretestPosttestUniqueProfilesVolunteers", binary=True)
saveAllDataCSV(allBinaryDataPlaytestPhase2PretestPosttestUniqueProfiles, "PlaytestPhase2PretestPosttestUniqueProfiles", binary=True)
saveAllDataCSV(allBinaryDataPlaytestPhase2PretestPosttestUniqueProfilesVolunteers, "PlaytestPhase2PretestPosttestUniqueProfilesVolunteers", binary=True)
else:
# Reload the four tables from the CSV files of the binary folder.
allBinaryDataPlaytestPhase1PretestPosttestUniqueProfiles = loadAllDataCSV("PlaytestPhase1PretestPosttestUniqueProfiles", binary=True)
allBinaryDataPlaytestPhase1PretestPosttestUniqueProfilesVolunteers = loadAllDataCSV("PlaytestPhase1PretestPosttestUniqueProfilesVolunteers", binary=True)
allBinaryDataPlaytestPhase2PretestPosttestUniqueProfiles = loadAllDataCSV("PlaytestPhase2PretestPosttestUniqueProfiles", binary=True)
allBinaryDataPlaytestPhase2PretestPosttestUniqueProfilesVolunteers = loadAllDataCSV("PlaytestPhase2PretestPosttestUniqueProfilesVolunteers", binary=True)
# Same caching scheme as for the binary tables, for the tables built with
# _binary=False and stored in the numeric folder (binary=False).
if regenerateData:
# Recompute the four tables, one per playtest dataset, with correctAnswers
# extended with demographicAnswers as correction source.
allNumericDataPlaytestPhase1PretestPosttestUniqueProfiles = getAllUserVectorData(
getAllResponders(gfdfPlaytestPhase1PretestPosttestUniqueProfiles),
_rmDF = rmdfPlaytestPhase1PretestPosttestUniqueProfiles,
_gfDF = gfdfPlaytestPhase1PretestPosttestUniqueProfiles,
_source = correctAnswers + demographicAnswers,
_binary=False )
allNumericDataPlaytestPhase1PretestPosttestUniqueProfilesVolunteers = getAllUserVectorData(
getAllResponders(gfdfPlaytestPhase1PretestPosttestUniqueProfilesVolunteers),
_rmDF = rmdfPlaytestPhase1PretestPosttestUniqueProfilesVolunteers,
_gfDF = gfdfPlaytestPhase1PretestPosttestUniqueProfilesVolunteers,
_source = correctAnswers + demographicAnswers,
_binary=False )
allNumericDataPlaytestPhase2PretestPosttestUniqueProfiles = getAllUserVectorData(
getAllResponders(gfdfPlaytestPhase2PretestPosttestUniqueProfiles),
_rmDF = rmdfPlaytestPhase2PretestPosttestUniqueProfiles,
_gfDF = gfdfPlaytestPhase2PretestPosttestUniqueProfiles,
_source = correctAnswers + demographicAnswers,
_binary=False )
allNumericDataPlaytestPhase2PretestPosttestUniqueProfilesVolunteers = getAllUserVectorData(
getAllResponders(gfdfPlaytestPhase2PretestPosttestUniqueProfilesVolunteers),
_rmDF = rmdfPlaytestPhase2PretestPosttestUniqueProfilesVolunteers,
_gfDF = gfdfPlaytestPhase2PretestPosttestUniqueProfilesVolunteers,
_source = correctAnswers + demographicAnswers,
_binary=False )
# Persist the freshly computed tables in the numeric folder.
saveAllDataCSV(allNumericDataPlaytestPhase1PretestPosttestUniqueProfiles, "PlaytestPhase1PretestPosttestUniqueProfiles", binary=False)
saveAllDataCSV(allNumericDataPlaytestPhase1PretestPosttestUniqueProfilesVolunteers, "PlaytestPhase1PretestPosttestUniqueProfilesVolunteers", binary=False)
saveAllDataCSV(allNumericDataPlaytestPhase2PretestPosttestUniqueProfiles, "PlaytestPhase2PretestPosttestUniqueProfiles", binary=False)
saveAllDataCSV(allNumericDataPlaytestPhase2PretestPosttestUniqueProfilesVolunteers, "PlaytestPhase2PretestPosttestUniqueProfilesVolunteers", binary=False)
else:
# Reload the four tables from the CSV files of the numeric folder.
allNumericDataPlaytestPhase1PretestPosttestUniqueProfiles = loadAllDataCSV("PlaytestPhase1PretestPosttestUniqueProfiles", binary=False)
allNumericDataPlaytestPhase1PretestPosttestUniqueProfilesVolunteers = loadAllDataCSV("PlaytestPhase1PretestPosttestUniqueProfilesVolunteers", binary=False)
allNumericDataPlaytestPhase2PretestPosttestUniqueProfiles = loadAllDataCSV("PlaytestPhase2PretestPosttestUniqueProfiles", binary=False)
allNumericDataPlaytestPhase2PretestPosttestUniqueProfilesVolunteers = loadAllDataCSV("PlaytestPhase2PretestPosttestUniqueProfilesVolunteers", binary=False)
In [ ]:
# Default "allData..." names: each one is bound to the binary variant of the
# corresponding table (same object, no copy is made).
allDataPlaytestPhase1PretestPosttestUniqueProfiles = allBinaryDataPlaytestPhase1PretestPosttestUniqueProfiles
allDataPlaytestPhase1PretestPosttestUniqueProfilesVolunteers = allBinaryDataPlaytestPhase1PretestPosttestUniqueProfilesVolunteers
allDataPlaytestPhase2PretestPosttestUniqueProfiles = allBinaryDataPlaytestPhase2PretestPosttestUniqueProfiles
allDataPlaytestPhase2PretestPosttestUniqueProfilesVolunteers = allBinaryDataPlaytestPhase2PretestPosttestUniqueProfilesVolunteers